# Load the raw loan-approval data.
# FIX: read.csv() already returns a data.frame, so the original
# data.frame() wrapper was redundant and has been dropped.
data_pre <- read.csv("loan_approval_dataset.csv")
data <- data_pre
str(data)
## 'data.frame': 4269 obs. of 13 variables:
## $ loan_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ no_of_dependents : int 2 0 3 3 5 0 5 2 0 5 ...
## $ education : chr " Graduate" " Not Graduate" " Graduate" " Graduate" ...
## $ self_employed : chr " No" " Yes" " No" " No" ...
## $ income_annum : int 9600000 4100000 9100000 8200000 9800000 4800000 8700000 5700000 800000 1100000 ...
## $ loan_amount : int 29900000 12200000 29700000 30700000 24200000 13500000 33000000 15000000 2200000 4300000 ...
## $ loan_term : int 12 8 20 8 20 10 4 20 20 10 ...
## $ cibil_score : int 778 417 506 467 382 319 678 382 782 388 ...
## $ residential_assets_value: int 2400000 2700000 7100000 18200000 12400000 6800000 22500000 13200000 1300000 3200000 ...
## $ commercial_assets_value : int 17600000 2200000 4500000 3300000 8200000 8300000 14800000 5700000 800000 1400000 ...
## $ luxury_assets_value : int 22700000 8800000 33300000 23300000 29400000 13700000 29200000 11800000 2800000 3300000 ...
## $ bank_asset_value : int 8000000 3300000 12800000 7900000 5000000 5100000 4300000 6000000 600000 1600000 ...
## $ loan_status : chr " Approved" " Rejected" " Rejected" " Rejected" ...
# Count missing values across the whole frame (0 expected for this dataset).
NA_values <- sum(is.na(data))
NA_values
# Drop the identifier column; it carries no predictive information.
data <- subset(data_pre, select = -c(loan_id))
# Encode the categorical columns as factors.
data$education <- as.factor(data$education)
data$self_employed <- as.factor(data$self_employed)
data$loan_status <- as.factor(data$loan_status)
str(data)
summary(data)
# Box plot of number of dependents split by loan status.
# FIX: binwidth is a histogram parameter, not a boxplot one; passing it to
# geom_boxplot() only triggered an "ignoring unknown parameters" warning,
# so it has been removed.
ggplot(data, aes(y = no_of_dependents, x = loan_status, fill = loan_status)) +
  geom_boxplot(color = "black", alpha = 0.9) +
  labs(
    title = "Box plot of number of dependents and loan status",
    x = "Loan Status",
    y = "Number of Dependents"
  ) +
  scale_fill_manual(values = c("#93C572", "#4682B4")) +
  theme_minimal()
# The CSV values carry a stray leading space (e.g. " Graduate"); strip it
# from all three categorical columns, then rebuild the factors on the
# cleaned labels.
data <- data %>%
  mutate(self_employed = str_trim(self_employed)) %>%
  mutate(education = str_trim(education)) %>%
  mutate(loan_status = str_trim(loan_status))
data$education <- as.factor(data$education)
data$self_employed <- as.factor(data$self_employed)
data$loan_status <- as.factor(data$loan_status)
# Sanity-check the column types after cleaning.
class(data$self_employed)
class(data$luxury_assets_value)
class(data$income_annum)
class(data$loan_status)
str(data)
# Fix the level order; an unused "Other" level is kept for the plot legend.
data$self_employed <- factor(data$self_employed, levels = c("Yes", "No", "Other"))
# Dodged bar chart of loan status by self-employment.
ggplot(data, aes(x = loan_status, fill = self_employed)) +
  geom_bar(position = "dodge", stat = "count") +
  labs(title = "Stacked Bar between Loan Status and Self Employment",
       x = "Loan Status",
       y = "Count",
       fill = "self_employed") +
  scale_fill_manual(values = c("Yes" = "#93C572", "No" = "#4682B4", "Other" = "gray")) +
  theme_minimal()
# Numeric copy for correlation analysis: binary indicators with
# 1 = Yes / Approved / Graduate respectively.
data1 <- data
data1$self_employed <- as.integer(data$self_employed == "Yes")
data1$loan_status <- as.integer(data$loan_status == "Approved")
data1$education <- as.integer(data$education == "Graduate")
# Density of loan term by loan status.
# FIX: the original stacked three theme calls (theme_bw() + theme() +
# theme_minimal()); each complete theme replaces the previous one, so only
# theme_minimal() had any effect — the redundant calls are dropped.
# alpha makes the overlapping densities both visible.
ggplot(data, aes(x = loan_term, fill = loan_status)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Density plot of Loan Term based on Loan Status",
    x = "Loan Term (Years)",
    y = "Density of the Applicants"
  ) +
  scale_fill_manual(values = c("#93C572", "#4682B4")) +
  theme_minimal()
# Loan amount against CIBIL score, coloured by outcome.
ggplot(data, aes(x = cibil_score, y = loan_amount, color = loan_status)) +
  geom_point() +
  labs(title = "Scatter Plot of CIBIL Score vs Loan Amount",
       x = "CIBIL Score",
       y = "Loan Amount") +
  scale_color_manual(values = c("#93C572", "#4682B4")) +
  scale_x_continuous(breaks = seq(min(data$cibil_score),
                                  max(data$cibil_score), by = 50)) +
  scale_y_continuous(labels = comma_format(scale = 1e-6, suffix = "M"),
                     breaks = seq(0, 35000000, by = 5000000)) +
  theme_minimal()
# Residential assets value vs loan amount, coloured by outcome.
# FIX: the title said "CIBIL Score vs Loan Amount" (copy-paste from the
# previous plot); it now names the variables actually plotted.
ggplot(data, aes(x = residential_assets_value, y = loan_amount, color = loan_status)) +
  geom_point() +
  labs(title = "Scatter Plot of Residential Assets Value vs Loan Amount",
       x = "Residential Assets value",
       y = "Loan Amount") +
  scale_color_manual(values = c("#93C572", "#4682B4")) +
  theme_minimal()
# Helper: scatter plot of one numeric feature against loan amount,
# coloured by loan status. Factored out of four near-identical chunks.
plot_vs_loan_amount <- function(xcol, x_title, plot_title) {
  ggplot(data, aes(x = .data[[xcol]], y = loan_amount, color = loan_status)) +
    geom_point() +
    labs(title = plot_title, x = x_title, y = "Loan Amount") +
    scale_color_manual(values = c("#93C572", "#4682B4")) +
    theme_minimal()
}
print(plot_vs_loan_amount("commercial_assets_value",
                          "Commercial assets value",
                          "Scatter Plot of Commercial assets value vs Loan Amount"))
print(plot_vs_loan_amount("luxury_assets_value",
                          "Luxury Assets value",
                          "Scatter Plot of luxury assets value vs Loan Amount"))
print(plot_vs_loan_amount("bank_asset_value",
                          "bank Assets value",
                          "Scatter Plot of bank asset value vs Loan Amount"))
print(plot_vs_loan_amount("income_annum",
                          "Income annum",
                          "Scatter Plot of income annum vs Loan Amount"))
# CIBIL score by self-employment status, split by loan outcome
# (outliers suppressed).
ggplot(data, aes(x = self_employed, y = cibil_score, fill = loan_status)) +
  geom_boxplot(outlier.shape = NA) +
  scale_fill_manual(values = c("Approved" = "#93C572", "Rejected" = "#4682B4")) +
  labs(title = "Box plot between loan status and cibil score",
       x = "Self Employed",
       y = "cibil score") +
  theme_minimal()
# Density of bank assets by loan status.
# FIX: the x-axis used comma_format(scale = 1e-7, suffix = "M"), which
# labels 10,000,000 as "1M" (off by a factor of 10). scale = 1e-6 is the
# correct millions conversion and matches the other plots in this report.
ggplot(data, aes(x = bank_asset_value, fill = loan_status)) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Bank Assets grouped by Loan Status",
       x = "Bank Assets",
       y = "Density") +
  scale_fill_manual(values = c("Approved" = "#93C572", "Rejected" = "#4682B4")) +
  theme_minimal() +
  scale_x_continuous(labels = comma_format(scale = 1e-6, suffix = "M"))
# Density of CIBIL score by loan status.
ggplot(data, aes(x = cibil_score, fill = loan_status)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("Approved" = "#93C572", "Rejected" = "#4682B4")) +
  labs(title = "Density Plot of Cibil Score grouped by Loan Status",
       x = "CIBIL Score",
       y = "Density") +
  theme_minimal()
# Pairwise correlations on the numeric-encoded copy of the data.
x <- cor(data1)
corrplot(x, type = "full", tl.cex = 0.7, method = "color",
         col = colorRampPalette(brewer.pal(6, "PuOr"))(100))
##
## Welch Two Sample t-test
##
## data: Approved$cibil_score and rejected$cibil_score
## t = 88, df = 4263, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 268 280
## sample estimates:
## mean of x mean of y
## 703 429
Null Hypothesis (\(H_{0}\)): CIBIL score has no significant association with loan status.
Alternate Hypothesis (\(H_{A}\)): CIBIL score has significant association with loan status.
The p-value, effectively \(0\), is far smaller than the standard alpha level of 0.05; hence, we reject the null hypothesis and conclude that CIBIL score has a significant association with the probability of loan approval.
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: c
## X-squared = 0.08, df = 1, p-value = 0.8
Null Hypothesis (\(H_{0}\)): Education level and loan status are independent of each other.
Alternate Hypothesis (\(H_{A}\)): Education level and loan status are dependent on each other.
Education level and loan status have a high p-value of
\(0.772\). Thus, we fail to reject the null
hypothesis (note that failing to reject is not the same as proving it), and we conclude
that the education level of an applicant shows no significant impact on
loan approval in this data.
##
## Welch Two Sample t-test
##
## data: Approved$bank_asset_value and rejected$bank_asset_value
## t = -0.4, df = 3453, p-value = 0.7
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -245677 154809
## sample estimates:
## mean of x mean of y
## 4959526 5004960
Null Hypothesis (\(H_{0}\)): Bank asset value and loan status are independent of each other.
Alternative Hypothesis (\(H_{A}\)): Bank asset value and loan status are dependent on each other.
Bank asset value and loan status have a high p-value of
\(0.656\). Thus, we cannot reject the
null hypothesis. We can therefore state that bank asset value and loan
status are not significantly associated.
##
## Welch Two Sample t-test
##
## data: Approved$residential_assets_value and rejected$residential_assets_value
## t = -0.9, df = 3400, p-value = 0.3
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -595310 209937
## sample estimates:
## mean of x mean of y
## 7399812 7592498
Null Hypothesis (\(H_{0}\)): There is no significant association between the values of residential asset and loan approval status.
Alternative Hypothesis (\(H_{A}\)):
There is a significant association between the values of residential
asset and loan approval status. With a p-value of \(0.348\), we cannot reject the null
hypothesis; hence the data provide no evidence of a significant association
between residential assets value and loan status.
##
## Welch Two Sample t-test
##
## data: Approved$no_of_dependents and rejected$no_of_dependents
## t = -1, df = 3400, p-value = 0.2
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1683 0.0416
## sample estimates:
## mean of x mean of y
## 2.47 2.54
Null Hypothesis (\(H_{0}\)): There is no significant association between the number of dependents and loan approval status.
Alternative Hypothesis (\(H_{A}\)): There is a significant association between the number of dependents and loan approval status.
With a p-value of \(0.237\), we fail to reject the null
hypothesis; hence the data provide no evidence of a significant association
between the number of dependents and loan status.
##
## Welch Two Sample t-test
##
## data: a$luxury_assets_value and r$luxury_assets_value
## t = -1, df = 3442, p-value = 0.3
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -851752 271073
## sample estimates:
## mean of x mean of y
## 15016604 15306944
Null Hypothesis (\(H_{0}\)): The luxury assets value of an applicant has no significant association with their loan status.
Alternate Hypothesis (\(H_{A}\)): The luxury assets value of an applicant has significant association with their loan status.
With a p-value of \(0.311\), we cannot reject the null
hypothesis and thus, we conclude that the luxury assets value of an
applicant has no significant association with their loan status.
## Pearson Correlation Coefficient: 0.00844
## p-value: 0.582
The small value of the Pearson correlation coefficient \(0.008\) suggests a weak relationship
between loan_term and loan_amount. The high
p-value \(0.582\)
indicates that the resultant correlation is not statistically
significant.
library("leaps")
# Exhaustive best-subset search for loan_amount: up to 10 predictors,
# keeping the best 2 models of each size.
reg.best10 <- regsubsets(loan_amount ~ ., data = data, nvmax = 10,
                         nbest = 2, method = "exhaustive")
# In the "leaps" package, we can use scale=c("bic","Cp","adjr2","r2")
plot(reg.best10, scale = "adjr2", main = "Adjusted R^2")
plot(reg.best10, scale = "r2", main = "R^2")
plot(reg.best10, scale = "bic", main = "BIC")
plot(reg.best10, scale = "Cp", main = "Cp")
summary(reg.best10)
The regsubsets() selection method aims to find the best
subset of predictor variables that minimizes or maximizes a chosen
criterion, such as Adjusted R-squared (adjr2), R-squared (r2), Bayesian
Information Criterion (BIC), or Mallows’ Cp.
In case of the Adjusted R-squared plot, the best possible set of
predictors are found to be: no_of_dependents,
loan_term, income_annum,
commercial_assets_value, cibil_score and
loan_status. In case of the R-squared plot, the best
possible set of predictors are found to be:
no_of_dependents, loan_term,
income_annum, commercial_assets_value,
cibil_score and loan_status. From the BIC plot
we can observe that the best possible set of predictors are found to be:
no_of_dependents, loan_term,
income_annum, residential_assets_value,
commercial_assets_value, cibil_score and
loan_status. From the Cp Mallow plot we can observe that
the best possible set of predictors are found to be:
no_of_dependents, income_annum,
residential_assets_value,
commercial_assets_value, cibil_score and
loan_status.
# Keep the search summary around for later inspection.
summaryRegForward <- summary(reg.best10)
# Adjusted R^2 across subset sizes (models of size >= 7 only).
car::subsets(reg.best10, statistic = "adjr2", legend = FALSE,
             min.size = 7, main = "Adjusted R^2 Plot")
From the Adjusted R-squared based statistic plot, the most suitable set
of predictors are found to be:
no_of_dependents, loan_term, income_annum,
commercial_assets_value, cibil_score,
luxury_assets_value
and loan_status.
# Mallow's Cp across subset sizes (models of size >= 4 only).
subsets(reg.best10, statistic="cp", legend = FALSE, min.size = 4, main = "Mallow Cp Plot")
# Reference line Cp = p + 1; models near or below it have little
# estimated bias.
abline(a = 1, b = 1, lty = 3)
The most relevant predictors from the Mallow Cp plot are found to be
no_of_dependents, income_annum,
commercial_assets_value, cibil_score and
loan_status.
library("bestglm")
# Exhaustive AIC-based logistic subset search. bestglm expects the
# response as the LAST column of Xy — loan_status already is.
res.bestglm <- bestglm(Xy = data, family = binomial, IC = "AIC",
                       method = "exhaustive")
summary(res.bestglm)
res.bestglm$BestModels
summary(res.bestglm$BestModels)
In our comprehensive analysis, we adopt a dual-pronged approach to enhance the predictive capabilities of our model. Specifically, we employ both regression and classification techniques to predict distinct aspects of the loan application process—loan amount and loan status, respectively.
The dataset was initially explored to understand the distribution of
the target variable loan_status. This binary classification
variable represents whether a loan was approved or not. To ensure model
generalization, the data was then split into training (80%) and test
(20%) sets using a fixed random seed for reproducibility.
# Class balance of the response and the rejected share.
status_counts <- table(data$loan_status)
status_counts
status_counts[2] / sum(status_counts)
# Reproducible 80/20 train/test split.
set.seed(1)
train_idx = sample(1:nrow(data),
                   round(0.8 * nrow(data), 0),
                   replace = FALSE)
length(train_idx) / nrow(data)
data_train = data[train_idx, ]
data_test = data[-train_idx, ]
nrow(data_train)
nrow(data_test)
A linear regression model was constructed using the lm()
function in R, predicting loan_amount based on various features,
including no_of_dependents, loan_term,
income_annum, commercial_assets_value,
cibil_score, and loan_status.
# Linear model for loan amount using the predictors selected by the
# subset search above; fitted on the full dataset for the summary tables.
model <- lm(
  loan_amount ~ no_of_dependents + loan_term + income_annum +
    commercial_assets_value + cibil_score + loan_status,
  data = data
)
summary(model)
ezids::xkabledply(model, title = "Summary of loan amount prediction")
ezids::xkablevif(model)
The summary statistics and variance inflation factor (VIF) were analyzed for insights. which gives us values lesser than 3 which means there is no multicollinearity in our features.
# Refit on the training split only, then score both splits.
model <- lm(loan_amount ~ no_of_dependents + loan_term + income_annum +
              commercial_assets_value + cibil_score + loan_status,
            data = data_train)
train_predictions <- predict(model, newdata = data_train)
test_predictions <- predict(model, newdata = data_test)
plot_data <- data.frame(Actual = data_test$loan_amount,
                        Predicted = test_predictions)
# Squared correlation between observed and fitted values = R-squared.
train_r_squared <- cor(data_train$loan_amount, train_predictions)^2
cat("Training R-squared:", train_r_squared, "\n")
## Training R-squared: 0.862
test_r_squared <- cor(data_test$loan_amount, test_predictions)^2
cat("Testing R-squared:", test_r_squared, "\n")
## Testing R-squared: 0.862
# Observed vs fitted on the test split; the dashed line marks perfect
# prediction.
ggplot(plot_data, aes(x = Actual, y = Predicted)) +
  geom_point(color = "#93C572") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Actual vs Predicted Values",
       x = "Actual Values",
       y = "Predicted Values") +
  scale_x_continuous(labels = comma_format(scale = 1e-6, suffix = "M"),
                     breaks = seq(0, 35000000, by = 5000000)) +
  scale_y_continuous(labels = comma_format(scale = 1e-6, suffix = "M"),
                     breaks = seq(0, 35000000, by = 5000000)) +
  theme_minimal()
The scatter plot depicts the relationship between actual and predicted loan amounts, with a dashed red line marking the ideal prediction scenario. R-squared values of 0.862 for both training and testing highlight the model’s robust explanatory power and generalization to unseen data.
This study employs logistic regression to build a predictive model
for loan status based on key features in a given dataset. The logistic
regression model is constructed using the glm() function in
R, with an emphasis on variables such as the number of dependents,
annual income, loan amount, loan term, credit score (CIBIL score),
luxury assets value, and bank assets value. The summary output of the
model is analyzed to assess the significance and impact of each
predictor on loan status.
# Logistic regression for loan status on the training split. glm()
# models the probability of the second factor level ("Rejected").
Logit <- glm(
  loan_status ~ no_of_dependents + income_annum + loan_amount + loan_term +
    cibil_score + luxury_assets_value + bank_asset_value,
  data = data_train, family = "binomial"
)
summary_output <- summary(Logit)
summary_output
The coefficients table reveals the estimated effects of each predictor on the log-odds of loan approval. Key findings include:
The intercept has a substantial positive effect on the log-odds.
Variables such as loan_term and CIBIL_score
significantly impact loan approval, as indicated by their respective
z-values and low p-values. The no_of_dependents,
income_annum, loan_amount,
luxury_assets_value, and bank_asset_value show
minimal impact on loan approval.
# Render the logistic-regression coefficient table with ezids.
ezids::xkabledply(Logit, title =" Summary of logistic Regression for loan status")
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| (Intercept) | 11.2729 | 0.4762 | 23.671 | 0.000 |
| no_of_dependents | 0.0066 | 0.0385 | 0.171 | 0.864 |
| income_annum | 0.0000 | 0.0000 | 5.512 | 0.000 |
| loan_amount | 0.0000 | 0.0000 | -6.715 | 0.000 |
| loan_term | 0.1491 | 0.0127 | 11.718 | 0.000 |
| cibil_score | -0.0246 | 0.0009 | -26.587 | 0.000 |
| luxury_assets_value | 0.0000 | 0.0000 | -1.353 | 0.176 |
| bank_asset_value | 0.0000 | 0.0000 | -1.097 | 0.273 |
# Odds ratios: exponentiated logit coefficients.
expcoeff <- exp(coef(Logit))
ezids::xkabledply(as.table(expcoeff), title = "Exponential of coefficients in Logit Reg")
| x | |
|---|---|
| (Intercept) | 7.87e+04 |
| no_of_dependents | 1.01e+00 |
| income_annum | 1.00e+00 |
| loan_amount | 1.00e+00 |
| loan_term | 1.16e+00 |
| cibil_score | 9.76e-01 |
| luxury_assets_value | 1.00e+00 |
| bank_asset_value | 1.00e+00 |
Feature importance summary:
# Fitted probabilities for both splits. glm() models P(second factor
# level), so scores near 1 lean toward "Rejected".
data_train$prediction <- predict( Logit, newdata = data_train, type = "response" )
data_test$prediction <- predict( Logit, newdata = data_test , type = "response" )
# Distribution of the prediction score grouped by known outcome.
ggplot( data_train, aes( prediction, color = as.factor(loan_status) ) ) +
  geom_density( size = 1 ) +
  ggtitle( "Training Set's Predicted Score" ) + labs(color = "Loan Status")
# Classify with a 0.49 probability cutoff (1 = predicted second level).
train_predictions <- predict(Logit, newdata = data_train, type = "response")
train_predictions_class <- ifelse(train_predictions > 0.49, 1, 0)
train_conf_matrix <- table(Predicted = train_predictions_class, Actual = data_train$loan_status)
train_accuracy <- sum(diag(as.matrix(train_conf_matrix))) / sum(train_conf_matrix)
print(paste("Training Accuracy:", round(train_accuracy * 100, 2), "%"))
## [1] "Training Accuracy: 91.45 %"
test_predictions <- predict(Logit, newdata = data_test, type = "response")
test_predictions_class <- ifelse(test_predictions > 0.49, 1, 0)
test_conf_matrix <- table(Predicted = test_predictions_class, Actual = data_test$loan_status)
test_accuracy <- sum(diag(as.matrix(test_conf_matrix))) / sum(test_conf_matrix)
print(paste("Test Accuracy:", round(test_accuracy * 100, 2), "%"))
## [1] "Test Accuracy: 93.09 %"
# FIX: precision and recall were swapped. The confusion matrices have
# Predicted on the ROWS and Actual on the COLUMNS, so for the positive
# (row/column 2) class:
#   precision = TP / all predicted positive = conf[2, 2] / sum(conf[2, ])
#   recall    = TP / all actual positive    = conf[2, 2] / sum(conf[, 2])
train_precision <- train_conf_matrix[2, 2] / sum(train_conf_matrix[2, ])
print(paste("Training Precision:", round(train_precision*100, 2), "%"))
## [1] "Training Precision: 88.51 %"
train_recall <- train_conf_matrix[2, 2] / sum(train_conf_matrix[, 2])
print(paste("Training Recall:", round(train_recall*100, 2), "%"))
## [1] "Training Recall: 88.92 %"
test_precision <- test_conf_matrix[2, 2] / sum(test_conf_matrix[2, ])
print(paste("Test Precision:", round(test_precision*100, 2), "%"))
## [1] "Test Precision: 90.97 %"
test_recall <- test_conf_matrix[2, 2] / sum(test_conf_matrix[, 2])
print(paste("Test Recall:", round(test_recall*100, 2), "%"))
## [1] "Test Recall: 90.68 %"
In the logistic regression model, the following performance metrics were observed:
These metrics provide an overview of the model’s ability to correctly predict loan approval status. The high accuracy and precision scores indicate a strong predictive performance. Additionally, balanced recall scores suggest that the model effectively captures both approved and rejected instances. This comprehensive evaluation demonstrates the logistic regression model’s robustness in making accurate predictions on both the training and test datasets.
library("regclass")
# confusion_matrix(admitLogit)
# Training-data confusion matrix from regclass (uses its default
# probability cutoff — presumably 0.5; confirm against the package docs).
ezids::xkabledply( confusion_matrix(Logit), title = "Confusion matrix from Logit Model" )
| Predicted Approved | Predicted Rejected | Total | |
|---|---|---|---|
| Actual Approved | 1979 | 145 | 2124 |
| Actual Rejected | 150 | 1141 | 1291 |
| Total | 2129 | 1286 | 3415 |
This confusion matrix provides a detailed breakdown of the model's
predictions and actual outcomes for the two classes (Approved and
Rejected). Taking Approved as the positive class, it contains 1979 True
Positives, 150 False Positives (rejected applications predicted as
approved), 145 False Negatives (approved applications predicted as
rejected), and 1141 True Negatives. These counts are the basis for
evaluation measures such as precision, recall, and accuracy.
The Receiver Operating Characteristic (ROC) curve plots the true positive rate (sensitivity) against the false positive rate (1 − specificity), and the Area Under the Curve (AUC) summarizes it in a single number. The AUC typically lies between 0.5 and 1; values above 0.8 are considered a good model fit.
library("pROC")
# Score the training set and measure discrimination via ROC / AUC.
prob <- predict(Logit, type = "response")
data_train$prob <- prob
logit_roc <- roc(loan_status ~ prob, data = data_train)
roc_curve <- auc(logit_roc)  # area-under-curve; prefer 0.8 or higher
k_logit <- roc_curve         # saved for the model comparison at the end
plot(logit_roc, main = "ROC Curve", col = "gold", lwd = 2)
text(0.8, 0.2, paste("AUC =", round(auc(logit_roc), 3)), col = "black")
# unloadPkg("pROC")
# unloadPkg("pROC")
# McFadden's pseudo R-squared: compare the fitted model against an
# intercept-only null model.
# FIX: the null model was fitted on the full `data` while Logit was
# fitted on `data_train`; log-likelihoods are only comparable when both
# models use the same observations, so the null model now uses data_train.
NullLogit <- glm(loan_status ~ 1, data = data_train, family = "binomial")
mcFadden = 1 - logLik(Logit)/logLik(NullLogit)
mcFadden
In logistic regression, McFadden's pseudo R-squared compares the log-likelihood of the fitted model (8 degrees of freedom) with that of an intercept-only null model; here it equals 0.729. A higher pseudo R-squared indicates a better fit relative to the null model, although there is no universal threshold; a value of 0.729 points to a strong fit in our analysis.
# knn() needs an all-numeric feature matrix: recode the self_employed
# factor to its integer level codes in both splits (levels were set to
# c("Yes", "No", "Other") earlier, so Yes = 1, No = 2).
data_train$self_employed<-as.numeric(data_train$self_employed)
data_test$self_employed<-as.numeric(data_test$self_employed)
str(data_train)
library("class")
library("ggplot2")
# Fit a KNN classifier for a single k and return c(k, validation accuracy).
chooseK = function(k, train_set, val_set, train_class, val_class){
  set.seed(1)
  preds <- knn(train = train_set,
               test = val_set,
               cl = train_class,
               k = k)
  tab <- table(preds, val_class)
  # Accuracy = correctly classified cells (the diagonal) over all cases.
  cbind(k = k, accuracy = sum(diag(tab)) / sum(tab))
}
# Sweep odd k from 1 to 21 and record the test accuracy at each value.
knn_feats <- c("no_of_dependents", "self_employed", "income_annum",
               "loan_amount", "loan_term", "cibil_score",
               "luxury_assets_value", "bank_asset_value")
knn_different_k = sapply(seq(1, 21, by = 2),
                         function(x) chooseK(x,
                                             train_set = data_train[, knn_feats],
                                             val_set = data_test[, knn_feats],
                                             train_class = data_train[, "loan_status"],
                                             val_class = data_test[, "loan_status"]))
str(knn_different_k)
## num [1:2, 1:11] 1 0.542 3 0.573 5 ...
# Reshape the 2 x 11 result matrix into a tidy data frame for plotting.
knn_different_k = data.frame(k = knn_different_k[1,],
                             accuracy = knn_different_k[2,])
ggplot(knn_different_k, aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3) +
  labs(title = "accuracy vs k") +
  theme_minimal()
# Resubstitution (training-set) accuracy of KNN at k = 18.
feat_cols <- c("no_of_dependents", "self_employed", "income_annum",
               "loan_amount", "loan_term", "cibil_score",
               "luxury_assets_value", "bank_asset_value")
train_predictions <- knn(train = data_train[, feat_cols],
                         test = data_train[, feat_cols],
                         cl = data_train[, "loan_status"],
                         k = 18)
train_conf_matrix <- table(Predicted = train_predictions, Actual = data_train$loan_status)
print(train_conf_matrix)
## Actual
## Predicted Approved Rejected
## Approved 1855 980
## Rejected 269 311
train_accuracy <- sum(diag(as.matrix(train_conf_matrix))) / sum(train_conf_matrix)
print(paste("Training Accuracy:", round(train_accuracy * 100, 2), "%"))
## [1] "Training Accuracy: 63.43 %"
library("class")
set.seed(1)
# Final 18-NN fit, evaluated on the held-out test split.
knn_cols <- c("no_of_dependents", "self_employed", "income_annum",
              "loan_amount", "loan_term", "cibil_score",
              "luxury_assets_value", "bank_asset_value")
bank_18NN = knn(train = data_train[, knn_cols],
                test = data_test[, knn_cols],
                cl = data_train[, "loan_status"],
                k = 18)
str(bank_18NN)
## Factor w/ 2 levels "Approved","Rejected": 1 1 1 1 1 2 1 1 2 2 ...
length(bank_18NN)
## [1] 854
table(bank_18NN)
## bank_18NN
## Approved Rejected
## 704 150
conf_matrix <- table(Predicted = bank_18NN, Actual = data_test$loan_status)
accuracy <- sum(diag(as.matrix(conf_matrix))) / sum(conf_matrix)
print(paste("Test Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Test Accuracy: 61.12 %"
library("gmodels")
# Detailed cross-tabulation of actual vs predicted classes.
IRISPREDCross <- CrossTable(data_test[,"loan_status"], bank_18NN, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 854
##
##
## | bank_18NN
## data_test[, "loan_status"] | Approved | Rejected | Row Total |
## ---------------------------|-----------|-----------|-----------|
## Approved | 452 | 80 | 532 |
## | 0.850 | 0.150 | 0.623 |
## | 0.642 | 0.533 | |
## | 0.529 | 0.094 | |
## ---------------------------|-----------|-----------|-----------|
## Rejected | 252 | 70 | 322 |
## | 0.783 | 0.217 | 0.377 |
## | 0.358 | 0.467 | |
## | 0.295 | 0.082 | |
## ---------------------------|-----------|-----------|-----------|
## Column Total | 704 | 150 | 854 |
## | 0.824 | 0.176 | |
## ---------------------------|-----------|-----------|-----------|
##
##
library("caret")
# caret::confusionMatrix for the KNN predictions; the positive class
# defaults to the first factor level ("Approved").
cm = confusionMatrix(bank_18NN, reference = as.factor(data_test[, "loan_status"]) )
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Approved Rejected
## Approved 452 252
## Rejected 80 70
##
## Accuracy : 0.611
## 95% CI : (0.578, 0.644)
## No Information Rate : 0.623
## P-Value [Acc > NIR] : 0.771
##
## Kappa : 0.075
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.850
## Specificity : 0.217
## Pos Pred Value : 0.642
## Neg Pred Value : 0.467
## Prevalence : 0.623
## Detection Rate : 0.529
## Detection Prevalence : 0.824
## Balanced Accuracy : 0.534
##
## 'Positive' Class : Approved
##
# KNN precision/recall from caret's byClass slot (positive = "Approved").
precision <- cm$byClass[["Precision"]]
recall <- cm$byClass[["Recall"]]
print(paste("Precision:", round(precision, 2)))
## [1] "Precision: 0.64"
print(paste("Recall:", round(recall, 2)))
## [1] "Recall: 0.85"
library("class")
library("pROC")
set.seed(1)
# Binarise labels and predictions (Approved = 1) and build the KNN ROC.
actual_bin <- ifelse(data_test[, "loan_status"] == "Approved", 1, 0)
pred_bin <- ifelse(bank_18NN == "Approved", 1, 0)
roc_curve <- roc(actual_bin, pred_bin)
plot(roc_curve, main = "ROC Curve", col = "gold", lwd = 2)
text(0.8, 0.2, paste("AUC =", round(auc(roc_curve), 3)), col = "black")
abline(a = 0, b = 1, col = "gray", lty = 2)
k <- auc(roc_curve)  # saved for the model comparison at the end
library(rpart)
# Classification tree for loan status on the training split.
tree_model <- rpart(
  loan_status ~ no_of_dependents + self_employed + income_annum +
    loan_amount + loan_term + cibil_score + luxury_assets_value +
    bank_asset_value,
  data = data_train, method = "class"
)
# Resubstitution (training) accuracy.
tree_predictions <- predict(tree_model, newdata = data_train, type = "class")
conf_matrix <- table(tree_predictions, data_train$loan_status)
accuracy <- sum(diag(as.matrix(conf_matrix))) / sum(conf_matrix)
print(paste("Train Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Train Accuracy: 96.81 %"
library(rpart)
# FIX: the original refit an identical rpart model here (same formula,
# same data, same method). rpart is deterministic for fixed inputs, so the
# duplicate fit was redundant — reuse the tree fitted above and evaluate
# it on the held-out test split.
tree_predictions <- predict(tree_model, newdata = data_test, type = "class")
conf_matrix <- table(tree_predictions, data_test$loan_status)
accuracy <- sum(diag(as.matrix(conf_matrix))) / sum(conf_matrix)
print(paste("Test Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Test Accuracy: 96.49 %"
library("caret")
# Confusion matrix + summary statistics for the decision-tree test
# predictions (positive class = first level, "Approved").
cm = confusionMatrix(tree_predictions, reference = as.factor(data_test[, "loan_status"]) )
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Approved Rejected
## Approved 529 27
## Rejected 3 295
##
## Accuracy : 0.965
## 95% CI : (0.95, 0.976)
## No Information Rate : 0.623
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.924
##
## Mcnemar's Test P-Value : 2.68e-05
##
## Sensitivity : 0.994
## Specificity : 0.916
## Pos Pred Value : 0.951
## Neg Pred Value : 0.990
## Prevalence : 0.623
## Detection Rate : 0.619
## Detection Prevalence : 0.651
## Balanced Accuracy : 0.955
##
## 'Positive' Class : Approved
##
# Tree precision/recall from caret's byClass slot (positive = "Approved").
precision <- cm$byClass[["Precision"]]
recall <- cm$byClass[["Recall"]]
print(paste("Precision:", round(precision, 2)))
## [1] "Precision: 0.95"
print(paste("Recall:", round(recall, 2)))
## [1] "Recall: 0.99"
library(pROC)
# Convert factor predictions and labels to their numeric codes for roc()
# (levels are "Approved", "Rejected", so the codes are 1 and 2).
tree_predictions <- as.numeric(tree_predictions)
data_test$loan_status <- as.numeric(data_test$loan_status)
roc_curve2 <- roc(data_test$loan_status, tree_predictions)
auc_value <- auc(roc_curve2)
plot(roc_curve2, main = "ROC Curve", col = "gold", lwd = 2)
text(0.8, 0.2, paste("AUC =", round(auc_value, 3)), col = "black", cex = 1.2)
cat("AUC:", auc_value, "\n")
## AUC: 0.955
# Side-by-side AUC comparison of the three classifiers so far.
# FIX: the printed label contained the typo "decision Tress"; labels have
# been corrected to "Decision Trees" / "Logistic Regression".
print(paste("AUC score of KNN", k))
## [1] "AUC score of KNN 0.533507682249101"
print(paste("AUC score of Decision Trees", auc_value))
## [1] "AUC score of Decision Trees 0.95525498528931"
print(paste("AUC score of Logistic Regression", k_logit))
## [1] "AUC score of Logistic Regression 0.96766802184032"
library(randomForest)
# 500-tree random forest for loan status; importance = TRUE records
# permutation and Gini importance per feature.
rf_model <- randomForest(
  loan_status ~ no_of_dependents + income_annum + loan_amount + loan_term +
    cibil_score + luxury_assets_value + bank_asset_value,
  data = data_train, ntree = 500, importance = TRUE
)
print(rf_model)
##
## Call:
## randomForest(formula = loan_status ~ no_of_dependents + income_annum + loan_amount + loan_term + cibil_score + luxury_assets_value + bank_asset_value, data = data_train, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 1.58%
## Confusion matrix:
## Approved Rejected class.error
## Approved 2108 16 0.00753
## Rejected 38 1253 0.02943
# Per-feature importance; cibil_score dominates by a wide margin.
feature_importance <- importance(rf_model)
print(feature_importance)
## Approved Rejected MeanDecreaseAccuracy MeanDecreaseGini
## no_of_dependents 1.33 4.34 3.74 15.8
## income_annum 15.75 14.63 21.43 37.2
## loan_amount 26.23 17.25 32.05 58.8
## loan_term 87.47 80.78 103.77 95.8
## cibil_score 383.23 399.68 440.78 1328.5
## luxury_assets_value 12.77 10.13 16.37 38.7
## bank_asset_value 10.79 9.59 14.86 29.8
# In-sample accuracy (100% — the forest fits the training data perfectly).
rf_predictions <- predict(rf_model, newdata = data_train)
conf_matrix <- table(rf_predictions, data_train$loan_status)
accuracy <- sum(diag(as.matrix(conf_matrix))) / sum(conf_matrix)
print(paste("Train Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Train Accuracy: 100 %"
# Held-out accuracy on the test split.
rf_predictions <- predict(rf_model, newdata = data_test, type = "class")
conf_matrix <- table(rf_predictions, data_test$loan_status)
accuracy <- sum(diag(as.matrix(conf_matrix))) / sum(conf_matrix)
print(paste("Test Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Test Accuracy: 98.59 %"
## [1] "Test Accuracy: 98.59 %"
library(pROC)
# Numeric codes for roc(); re-coding loan_status is a no-op if the tree
# chunk already converted it.
rf_predictions <- as.numeric(rf_predictions)
data_test$loan_status <- as.numeric(data_test$loan_status)
roc_curve2 <- roc(data_test$loan_status, rf_predictions)
auc_value <- auc(roc_curve2)
plot(roc_curve2, main = "ROC Curve", col = "gold", lwd = 2)
text(0.8, 0.2, paste("AUC =", round(auc_value, 3)), col = "black", cex = 1.2)
cat("AUC:", auc_value, "\n")
## AUC: 0.983
## AUC: 0.983
# Summary line chart of AUC by model (values transcribed from the runs
# above, ordered from weakest to strongest).
model_names <- c("KNN", "Decision Tree", "Logistic Regression", "Random Forest")
auc_values <- c(0.5335, 0.9553, 0.9677, 0.9832)
plot(auc_values, type = "o", col = "gold", pch = 16, lty = 1,
     xlab = "Model", ylab = "AUC Score",
     main = "AUC Score Progression", xaxt = "n")
grid()
axis(side = 1, at = seq_along(model_names), labels = model_names)
# Conventional "good fit" reference at AUC = 0.8.
abline(h = 0.8, col = "red", lty = 2)